# Remember which objects existed before this script ran; the cleanup step at
# the end of the script removes everything that is not listed here.
keep = ls()

########################################
#Create Indiana Soldiers Treatment Data#
########################################

# Regiment table: Indiana rows only, reduced to the distinct combinations of
# the five descriptive columns used downstream.
reg = fread(paste0(cwdb_path, '/regiments.csv'))
reg = reg[state %in% "IN"]
reg = unique(reg[, .(regimentpk, term_sort, type, orgdate, fullname)])

# Keep only regiments that have both a term code and an organization date,
# then append the manually maintained rows from in_units_missing.csv.
# fill = TRUE pads columns that exist in only one of the two tables with NA.
in_corrections = fread('./raw/in_units_missing.csv')
reg = rbindlist(
  list(reg[!is.na(term_sort) & !is.na(orgdate)], in_corrections),
  fill = TRUE
)

# Rows that carry no `failed` value are treated as 0.
# (presumably `failed` marks units that existed only on paper -- confirm
# against in_units_missing.csv)
reg[is.na(failed), failed := 0]

#Read in roster, keep soldiers in Indiana Regiments
# Only regiments whose `failed` flag is 0 are retained.
in_veterans_roster = fread(paste0(cwdb_path, '/regiment_roster_table.csv')) %>%
                      .[regimentpk %in% reg[failed %in% 0, regimentpk]]

#Get muster in/out date
# Values that do not parse as YYYY-MM-DD become NA.
in_veterans_roster[, mustindate := strptime(mustindate, "%Y-%m-%d") %>% as.Date]
in_veterans_roster[, outdate := strptime(outdate, "%Y-%m-%d") %>% as.Date]

#Clean Company Name
# Strip *every* non-word character. str_replace() only removes the first
# match, which would leave e.g. "(A)" as "A)" and split one company into two.
in_veterans_roster[, unitcompany := unitcompany %>% str_replace_all("\\W", "")]

# Company identifier "<regimentpk>:<company>". Rows with an empty, missing or
# "Y" company keep companypk = NA and are dropped later by the useCompany
# filter. The explicit NA guard matters because paste() would otherwise turn
# a missing company into the literal string "<regimentpk>:NA".
# (meaning of company code "Y" is not visible here -- TODO confirm)
in_veterans_roster[!is.na(unitcompany) & !(unitcompany %in% c("", "Y")),
                   c('companypk') := paste(regimentpk, unitcompany, sep = ":")]

#Read in persons data
# Load only the needed columns and keep persons that appear on the roster.
person_cols = c("personpk", "enlistdate", "stateserved", 'waswounded', 'rank_broadcode', 'rank', 'broadrank', 'enlistrank', 'survivedwar')
in_veterans_cwdb = fread(paste0(cwdb_path, '/person_table.csv'), select = person_cols)
in_veterans_cwdb = in_veterans_cwdb[personpk %in% in_veterans_roster$personpk]

#get enlistment dates, rank
enlist_dates = in_veterans_cwdb[, .(personpk, enlistdate, enlistrank, survivedwar)]
enlist_dates[, enlistdate := as.Date(strptime(enlistdate, "%Y-%m-%d"))]
enlist_dates[, enlist_year := year(enlistdate)]

# Recode the free-text enlistment rank. Patterns are applied in the order
# listed, so when a rank string matches several patterns the LAST matching
# entry wins. Ranks matching none of the patterns stay NA.
rank_patterns = c(
  Lieutenant = "Lieut",
  Sergeant   = "Serg",
  Musician   = "(Drummer)|(Fifer)|(Musician)",
  Corporal   = "Corpl",
  Captain    = "Captain",
  Private    = "Priv",
  Wagoner    = "Wagoner"
)
for (rank_label in names(rank_patterns)) {
  enlist_dates[str_detect(enlistrank, rank_patterns[[rank_label]]),
               enlist_rank_clean := rank_label]
}

# Factor version of the cleaned rank with an explicit level order.
rank_levels = c("Private", "Wagoner", "Musician", "Corporal", "Sergeant", "Lieutenant", "Captain")
enlist_dates[, enlist_rank_clean_f := factor(enlist_rank_clean, levels = rank_levels)]

#Keep first unit assignment for intent to treat
# Sorting by person and muster-in date means element 1 of each per-person
# vector is that person's first record in sort order.
# NOTE(review): setkey() places NA values first, so a record with a missing
# muster-in date is taken as the "first" assignment -- confirm this is intended.
setkey(in_veterans_roster, personpk, mustindate)

# One row per person: unit/date fields come from the first record, while the
# logical flags are TRUE if ANY of the person's records shows that outcome.
in_treat_roster = in_veterans_roster[, .(
  regimentpk      = regimentpk[1L],
  unitcompany     = unitcompany[1L],
  companypk       = companypk[1L],
  indate          = mustindate[1L],
  discharged      = any(outmethodsimple %in% "Discharged"),
  disabled        = any(outmethodsimple %in% "Disabled"),
  deserted        = any(outmethodsimple %in% "Deserted"),
  drafted         = any(inmethodsimple %in% "Drafted"),
  substitute      = any(inmethod %in% "Substitute"),
  outmethodsimple = outmethodsimple[1L],
  inmethodsimple  = inmethodsimple[1L],
  outdate         = outdate[1L]
), by = personpk]

#Roster for soldiers in unit
# One row per roster record (a person may appear several times). The
# person-level flags are computed per person and repeated on each of that
# person's rows.
in_roster = in_veterans_roster[, .(
  regimentpk,
  unitcompany,
  companypk,
  indate     = mustindate,
  discharged = any(outmethodsimple %in% "Discharged"),
  disabled   = any(outmethodsimple %in% "Disabled"),
  deserted   = any(outmethodsimple %in% "Deserted"),
  drafted    = any(inmethodsimple %in% "Drafted"),
  substitute = any(inmethod %in% "Substitute"),
  outmethodsimple,
  inmethodsimple,
  outdate
), by = personpk]

#Merge in type of regiment, term of service, regiment organization date
# Keyed join of a roster against `reg` (all roster rows are kept), followed by
# parsing the organization date and filling a missing term code with 10030.
add_regiment_info = function(roster) {
  setkey(roster, regimentpk)
  joined = reg[roster]
  joined[, orgdate := as.Date(strptime(orgdate, "%Y-%m-%d"))]
  joined[is.na(term_sort), term_sort := 10030]
  joined
}

# Keyed join of a roster against `enlist_dates` (all roster rows are kept).
add_enlistment_info = function(roster) {
  setkey(roster, personpk)
  enlist_dates[roster]
}

setkey(reg, regimentpk)
in_treat_roster = add_regiment_info(in_treat_roster)
in_roster = add_regiment_info(in_roster)

#Merge in enlistdate for ITT soldiers, in-company peers
setkey(enlist_dates, personpk)
in_treat_roster = add_enlistment_info(in_treat_roster)
in_roster = add_enlistment_info(in_roster)

#Clean indate: if missing, use enlistment date, or modal indate of company

# Most frequent non-missing value of `dates`, returned as a Date; NA when the
# input holds no non-missing value. Ties resolve to the earliest date because
# table() orders its entries and which.max() returns the first maximum.
# A plain if/else is used on purpose: ifelse() drops the Date class from its
# result, which would store the modal date as a bare day count.
modal_date = function(dates) {
  tab = table(dates)
  if (length(tab) > 0) {
    as.Date(names(tab)[which.max(tab)])
  } else {
    as.Date(NA)
  }
}

# The modal date is computed per regiment/company BEFORE any gaps are filled,
# so it reflects recorded muster-in dates only. Missing indate values are then
# filled from the enlistment date first and the company mode second.
in_treat_roster[, modal_indate := modal_date(indate), by = list(regimentpk, unitcompany)]
in_treat_roster[is.na(indate), indate := enlistdate]
in_treat_roster[is.na(indate), indate := modal_indate]

in_roster[, modal_indate := modal_date(indate), by = list(regimentpk, unitcompany)]
in_roster[is.na(indate), indate := enlistdate]
in_roster[is.na(indate), indate := modal_indate]

#Get list of persons to use; regiments to use
# Every person in in_treat_roster is used, so the regiment list is simply the
# set of regiments appearing in that table.
all_use_persons = unique(in_treat_roster$personpk)
all_use_regiments = unique(in_treat_roster$regimentpk)


############################
#Get within-unit experience#
############################

# Exit codes counted as a death, and the subset counted as `kia`.
death_codes = c('Died of disease', 'Died POW', 'Killed/Died of wounds')
kia_codes = c('Died POW', 'Killed/Died of wounds')
# Persons flagged as wounded in the person table.
wounded_persons = in_veterans_cwdb[waswounded == 'Y', personpk]

# Adds outcome flags, expected exit dates and sample-selection flags to a
# roster table and returns it.
add_service_flags = function(roster) {
  # term_sort minus 10000 is added to a date as a number of days
  # (presumably term_sort encodes 10000 + term length in days -- confirm).
  roster[, `:=`(
    died             = outmethodsimple %in% death_codes,
    kia              = outmethodsimple %in% kia_codes,
    wounded          = personpk %in% wounded_persons,
    expected_out     = indate + (term_sort - 10000),
    expected_out_alt = orgdate + (term_sort - 10000),
    # TRUE when the soldier mustered in within 100 days of the regiment's
    # organization date (NA if either date is missing)
    joined_at_org    = abs(as.numeric(indate - orgdate)) < 100
  )]

  #Get usable sample:
  #no missing date information
  #no missing company name
  roster[, `:=`(
    useTime     = !is.na(indate) & !is.na(expected_out),
    useTime_alt = joined_at_org & !is.na(expected_out_alt),
    useCompany  = !is.na(companypk)
  )]

  roster
}

in_treat_roster = add_service_flags(in_treat_roster)
in_roster = add_service_flags(in_roster)

#Get company casualty rate
##########################
# Service windows [indate, expected_out] per company, restricted to rows with
# usable dates and a company identifier.
window_cols = c('companypk', 'indate', 'expected_out')

itt_company = in_treat_roster[(useTime) & (useCompany),
                              .(personpk, companypk, died, kia, disabled, outmethodsimple, indate, expected_out)]
roster_company = in_roster[(useTime) & (useCompany),
                           .(personpk, companypk, died, kia, disabled, outmethodsimple, indate, expected_out)]

setkey(itt_company, companypk, indate, expected_out)
setkey(roster_company, companypk, indate, expected_out)

# Pair up records of the same company whose service windows overlap.
company_overlaps = foverlaps(na.omit(itt_company), na.omit(roster_company),
                             by.x = window_cols, by.y = window_cols)

# Drop self-pairs, then count overlapping soldiers and their outcomes.
# NOTE(review): in foverlaps() output the `i.`-prefixed columns come from the
# first argument (the intent-to-treat table), so the grouping person is the
# in_roster record and the sums run over ITT soldiers -- confirm this is the
# intended direction.
company_casualty_rate = company_overlaps[personpk != i.personpk,
                                         .(in_company_n = .N,
                                           company_deaths = sum(i.died),
                                           company_kia = sum(i.kia),
                                           company_disabled = sum(i.disabled)),
                                         by = .(personpk, companypk)]

# Free the large intermediates before the next step.
rm(list = c('itt_company', 'roster_company', 'company_overlaps', 'window_cols'))
gc()

#Get regimental casualty rate
#############################

# Regiments are processed one at a time, so each overlap join only involves
# the records of a single regiment.
u_regiments = na.omit(unique(in_treat_roster$regimentpk))

# Overlap statistics for one regiment `r`: for every person, the number of
# other soldiers whose service window overlaps, and how many of those died,
# were kia, or were disabled.
regiment_overlap_stats = function(r) {
  window_cols = c('regimentpk', 'indate', 'expected_out')

  itt_regiment = in_treat_roster[(useTime) & regimentpk %in% r,
                                 .(personpk, regimentpk, died, kia, disabled, indate, expected_out)]
  roster_regiment = in_roster[(useTime) & regimentpk %in% r,
                              .(personpk, regimentpk, died, kia, disabled, indate, expected_out)]
  setkey(itt_regiment, regimentpk, indate, expected_out)
  setkey(roster_regiment, regimentpk, indate, expected_out)

  overlaps = foverlaps(na.omit(itt_regiment), na.omit(roster_regiment),
                       by.x = window_cols, by.y = window_cols)

  # Self-pairs are excluded so a soldier does not count as his own peer.
  overlaps[personpk != i.personpk,
           .(in_regiment_n = .N,
             regiment_deaths = sum(i.died),
             regiment_kia = sum(i.kia),
             regiment_disabled = sum(i.disabled)),
           by = .(personpk, regimentpk)]
}

r_list = lapply(u_regiments, regiment_overlap_stats)

regiment_casualty_rate = rbindlist(r_list, fill = TRUE)

rm(list = c('r_list'))
gc()
  
################################################
#Get alternate regimental casualty / combat data
################################################

#Get regimental casualty records
# Each casualty record is turned into a one-day interval (start = end = its
# date) so it can be matched against soldiers' service windows.
regcas = fread(paste0(cwdb_path, '/regcas.csv'))
regcas = regcas[regimentpk %in% all_use_regiments]
regcas[, indate := as.Date(strptime(date, "%Y-%m-%d"))]
regcas[, expected_out := indate]

window_cols = c('regimentpk', 'indate', 'expected_out')

service_windows = in_treat_roster[(useTime) & !is.na(regimentpk),
                                  .(personpk, regimentpk, indate, expected_out)]
casualty_events = regcas[!is.na(indate) & !is.na(regimentpk),
                         .(regimentpk, indate, expected_out, killed, wounded, pow, missing, battlenum)]

setkey(casualty_events, regimentpk, indate, expected_out)

# Soldiers whose window overlaps no casualty record are kept with NA event
# columns, which the na.omit()/na.rm handling below turns into zeros.
event_overlaps = foverlaps(service_windows, casualty_events,
                           by.x = window_cols, by.y = window_cols)

# In the joined table the unprefixed `indate` is the casualty-record date, so
# combat_days counts distinct record dates inside the soldier's window.
regiment_combat_data = event_overlaps[, .(combat_days = length(unique(na.omit(indate))),
                                          kia = sum(killed, na.rm = TRUE),
                                          wia = sum(wounded, na.rm = TRUE),
                                          pow = sum(pow, na.rm = TRUE),
                                          mia = sum(missing, na.rm = TRUE),
                                          battles = length(unique(na.omit(battlenum)))),
                                      by = .(personpk, regimentpk)]

rm(list = c('service_windows', 'casualty_events', 'event_overlaps', 'window_cols'))
gc()


#################################
#Merge experience to roster data#
#################################

# Each step below is a keyed data.table join of the form X[Y] with
# Y = in_treat_roster: every roster row is kept, and soldiers without a match
# in X receive NA in the columns contributed by X.

#Merge in company casualty rates
setkey(in_treat_roster, personpk, companypk)
setkey(company_casualty_rate, personpk, companypk)

in_treat_roster = company_casualty_rate[in_treat_roster]

#Merge in regiment casualty rates
setkey(in_treat_roster, personpk, regimentpk)
setkey(regiment_casualty_rate, personpk, regimentpk)

in_treat_roster = regiment_casualty_rate[in_treat_roster]

#merge in regiment combat experience
# NOTE(review): regiment_combat_data and in_treat_roster both contain a column
# named `kia`. data.table prefixes the clashing column of the inner table with
# `i.`, so after this join `kia` should be the regimental killed count and
# `i.kia` the soldier-level flag -- confirm downstream code expects this.
setkey(in_treat_roster, personpk, regimentpk)
setkey(regiment_combat_data, personpk, regimentpk)

in_treat_roster = regiment_combat_data[in_treat_roster]

#Save
# Writes the merged intent-to-treat table to the cleaned-data directory.
fwrite(in_treat_roster, './cleaned/in_cwdb_treatment_data.csv')

#Cleanup
# Remove every object created by this script: anything not listed in `keep`
# (the workspace snapshot taken on the first line), except `keep` itself.
rm(list = setdiff(ls(), c(keep, 'keep')))
gc()